教程,当然是以官网为主,不过看英文笔记有挑战,简略带领大家一起学习咯: https://satijalab.org/seurat/get_started.html
需要自行下载安装一些必要的R包! 这里只展示安装稳定版的2.3版本
if (!requireNamespace("BiocManager"))
install.packages("BiocManager")
if (!requireNamespace("Seurat"))
BiocManager::install("Seurat")
加载R包
rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))
# 加载R包,请注意R包版本,可能会有莫名其妙的版本错误
# 单细胞转录组领域发展太快,不同版本的 同一个R包差异很大。
因为官网指明了有全新版Seurat包(3.x),但是出于GitHub阶段,所以目前我们仍然是介绍2.X版本,如果你一定要尝试3.0版本,使用下面的代码
devtools::install_github(repo = 'satijalab/seurat', ref = 'release/3.0')
我一直强调过,单细胞转录组领域发展太快,不同版本的 同一个R包差异很大,这个 Seurat包 也不例外,大部分的函数都改了,还专门有一个 https://satijalab.org/seurat/essential_commands.html 对照表格供大家学习。
然后也有新的基于3X的教程:https://satijalab.org/seurat/pbmc3k_tutorial.html
总觉得跟python一样,大版本更新,让人很烦。
我们选择 scRNAseq 这个R包。 这个包内置的是 Pollen et al. 2014 数据集,人类单细胞细胞,分成4类,分别是 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞。
大小是50.6 MB,下载需要一点点时间,先安装加载它们。
这个数据集很出名,截止2019年1月已经有近400的引用了,后面的人开发R包算法都会在其上面做测试,比如 SinQC 这篇文章就提到:We applied SinQC to a highly heterogeneous scRNA-seq dataset containing 301 cells (mixture of 11 different cell types) (Pollen et al., 2014).
不过本例子只使用了数据集的4种细胞类型而已,因为 scRNAseq 这个R包就提供了这些,完整的数据是 23730 features, 301 samples 在 https://hemberg-lab.github.io/scRNA.seq.datasets/human/tissues/
这里面的表达矩阵是由 RSEM (Li and Dewey 2011) 软件根据 hg38 RefSeq transcriptome 得到的,总是130个文库,每个细胞测了两次,测序深度不一样。
library(scRNAseq)
## ----- Load Example Data -----
data(fluidigm)
# Set assay to RSEM estimated counts
assay(fluidigm) <- assays(fluidigm)$rsem_counts
ct <- floor(assays(fluidigm)$rsem_counts)
ct[1:4,1:4]
## SRR1275356 SRR1274090 SRR1275251 SRR1275287
## A1BG 0 0 0 0
## A1BG-AS1 0 0 0 0
## A1CF 0 0 0 0
## A2M 0 0 0 33
简单看看表达矩阵的性质,主要是基因数量,细胞数量;以及每个细胞表达基因的数量,和每个基因在多少个细胞里面表达。
fivenum(apply(ct,1,function(x) sum(x>0) ))
## A3GALT2 PKHD1L1 MAP7 ATG2B YWHAZ
## 0 0 5 33 130
boxplot(apply(ct,1,function(x) sum(x>0) ))
fivenum(apply(ct,2,function(x) sum(x>0) ))
## SRR1275252 SRR1275296 SRR1275249 SRR1275289 SRR1275283
## 1418.0 2961.0 3841.5 5381.0 8221.0
hist(apply(ct,2,function(x) sum(x>0) ))
names(metadata(fluidigm))
## [1] "sample_info" "clusters" "which_qc"
meta <- as.data.frame(colData(fluidigm))
counts <- ct
检测了 counts 和 meta 两个变量,后面需要使用
identical(rownames(meta),colnames(counts))
## [1] TRUE
这里需要把Pollen的表达矩阵做成我们的Seurat要求的对象
Pollen <- CreateSeuratObject(raw.data = counts,
meta.data =meta,
min.cells = 3,
min.genes = 200,
project = "Pollen")
Pollen
## An object of class seurat in project Pollen
## 14656 genes across 130 samples.
## 后续所有的分析都基于这个 Pollen 变量,是一个对象
# An object of class seurat in project Pollen
将Pollen赋值给sce,目的是代码复用
sce <- Pollen
为元信息增加线粒体基因的比例,如果线粒体基因所占比例过高,意味着这可能是死细胞
mito.genes <- grep(pattern = "^MT-", x = rownames(x = sce@data), value = TRUE)
# 恰好这个例子的表达矩阵里面没有线粒体基因
percent.mito <- Matrix::colSums(sce@raw.data[mito.genes, ]) / Matrix::colSums(sce@raw.data)
## 也可以加入很多其它属性,比如 ERCC 等。
# AddMetaData adds columns to object@meta.data, and is a great place to stash QC stats
sce <- AddMetaData(object = sce, metadata = percent.mito,
col.name = "percent.mito")
这里绘图,可以指定分组,前提是这个分组变量存在于meta信息里面
这里的例子是:‘Biological_Condition’
VlnPlot(object = sce, features.plot = c("nGene", "nUMI", "percent.mito"), group.by = 'Biological_Condition', nCol = 3)
GenePlot(object = sce, gene1 = "nUMI", gene2 = "nGene")
可以看看高表达量基因是哪些
tail(sort(Matrix::rowSums(sce@raw.data)))
## SOX11 EEF1A1 SOX4 MAP1B TUBA1A MALAT1
## 1135137 1517751 1658262 2018553 2219675 2985038
GenePlot(object = sce, gene1 = "SOX11", gene2 = "EEF1A1")
CellPlot(sce,sce@cell.names[3],sce@cell.names[4],do.ident = FALSE)
起初sce对象里面的data就是原始表达矩阵
#
identical(sce@raw.data,sce@data)
## [1] TRUE
sce <- NormalizeData(object = sce,
normalization.method = "LogNormalize",
scale.factor = 10000,
display.progress = F)
经过了归一化,sce对象里面的data被改变。
identical(sce@raw.data,sce@data)
## [1] FALSE
寻找波动比较明显的基因,后续用这些基因而非全部基因进行分析,主要为了降低计算量。
sce <- FindVariableGenes(object = sce, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125,
x.high.cutoff = 3,
y.cutoff = 0.5)
# 通过调整参数可以得到不同数量的 var.genes
length(sce@var.genes)
## [1] 4944
对矩阵进行回归建模,以及scale
sce <- ScaleData(object = sce,
vars.to.regress = c("nUMI"),
display.progress = F)
现在sce对象的 sce@scale.data 也有了数值
运行PCA进行线性降维
sce <- RunPCA(object = sce,
pc.genes = sce@var.genes,
do.print = TRUE,
pcs.print = 1:5,
genes.print = 5)
## [1] "PC1"
## [1] "MLLT11" "KIDINS220" "SLA" "CALM1" "FAM110B"
## [1] ""
## [1] "HMGB2" "LIX1" "LDHA" "TRIM59" "ENO1"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "CENPF" "SHISA2" "CDK1" "HIST1H4C" "BARD1"
## [1] ""
## [1] "ECSCR" "MYCT1" "ELK3" "IMPG2" "ADGRF5"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "ADGRV1" "GPX3" "FAM107A" "HOPX" "PTN"
## [1] ""
## [1] "BCAT1" "EFNA5" "DLK1" "TUBA1C" "CXADR"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "S100A6" "LOX" "ADAMTS16" "CA2" "DKK3"
## [1] ""
## [1] "FAM50A" "DDX54" "ARFIP2" "IPP" "PXMP2"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "ASB16-AS1" "CENPL" "ATP1A1-AS1" "CHD9" "KLHL31"
## [1] ""
## [1] "CD44" "CA2" "SYNJ2" "LRP10" "TTC38"
## [1] ""
## [1] ""
sce@dr
## $pca
## A dimensional reduction object with key PC
## Number of dimensions: 20
## Projected dimensional reduction calculated: FALSE
## Jackstraw run: FALSE
这样就能拿到PC的基因的重要性占比情况。
tmp <- sce@dr$pca@gene.loadings
VizPCA( sce, pcs.use = 1:2)
PCAPlot(sce, dim.1 = 1, dim.2 = 2,
group.by = 'Biological_Condition')
sce <- ProjectPCA(sce, do.print = FALSE)
因为细胞数量不多,所以可以全部画出来
PCHeatmap(object = sce,
pc.use = 1,
cells.use = ncol(sce@data),
do.balanced = TRUE,
label.columns = FALSE)
PCHeatmap(object = sce,
pc.use = 1:10,
cells.use = ncol(sce@data),
do.balanced = TRUE,
label.columns = FALSE)
重点: 需要搞懂这里的 resolution 参数
sce <- FindClusters(object = sce,
reduction.type = "pca",
dims.use = 1:10, force.recalc = T,
resolution = 0.9, print.output = 0,
save.SNN = TRUE)
PrintFindClustersParams(sce)
## Parameters used in latest FindClusters calculation run on: 2019-02-25 15:46:49
## =============================================================================
## Resolution: 0.9
## -----------------------------------------------------------------------------
## Modularity Function Algorithm n.start n.iter
## 1 1 100 10
## -----------------------------------------------------------------------------
## Reduction used k.param prune.SNN
## pca 30 0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
table(sce@meta.data$res.0.9)
##
## 0 1 2
## 60 36 34
sce <- RunTSNE(object = sce,
dims.use = 1:10,
do.fast = TRUE,
perplexity=10)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = sce)
可以看到,虽然说有4类细胞,但是 GW16和GW21没有区分开来,需要探索参数。
table(meta$Biological_Condition)
##
## GW16 GW21 GW21+3 NPC
## 52 16 32 30
table(meta$Biological_Condition,sce@meta.data$res.0.9)
##
## 0 1 2
## GW16 46 4 2
## GW21 10 2 4
## GW21+3 4 0 28
## NPC 0 30 0
TSNEPlot(object = sce,group.by = 'Biological_Condition')
# 下面的代码是需要适应性修改,因为每次分组不一样,本次是3组。
markers_df <- FindMarkers(object = sce, ident.1 = 0, min.pct = 0.25)
print(x = head(markers_df))
## p_val avg_logFC pct.1 pct.2 p_val_adj
## UBE2C 1.415982e-13 -1.718926 0.050 0.686 2.075264e-09
## BIRC5 2.768278e-13 -1.826117 0.150 0.757 4.057188e-09
## PRDX3 9.349624e-13 -1.161770 0.500 0.914 1.370281e-08
## PAPSS1 1.460532e-12 -1.108028 0.150 0.757 2.140556e-08
## ARL6IP1 1.571976e-12 -1.175934 0.767 1.000 2.303888e-08
## TOP2A 2.209218e-12 -1.852498 0.233 0.800 3.237830e-08
markers_genes = rownames(head(x = markers_df, n = 5))
VlnPlot(object = sce, features.plot =markers_genes, use.raw = TRUE, y.log = TRUE)
FeaturePlot(object = sce,
features.plot =markers_genes,
cols.use = c("grey", "blue"),
reduction.use = "tsne")
markers_df <- FindMarkers(object = sce, ident.1 = 1, min.pct = 0.25)
print(x = head(markers_df))
## p_val avg_logFC pct.1 pct.2 p_val_adj
## GPC3 2.933407e-23 1.4547443 0.833 0.000 4.299201e-19
## LIX1 1.054475e-22 2.2791850 0.944 0.085 1.545438e-18
## DLK1 2.300166e-22 2.4269428 0.806 0.000 3.371124e-18
## RPS4Y1 2.300166e-22 1.6552985 0.806 0.000 3.371124e-18
## MAD2L1 1.189567e-21 1.6846057 0.944 0.117 1.743429e-17
## IQGAP3 1.021753e-20 0.5417169 0.833 0.043 1.497481e-16
markers_genes = rownames(head(x = markers_df, n = 5))
VlnPlot(object = sce, features.plot =markers_genes, use.raw = TRUE, y.log = TRUE)
FeaturePlot(object = sce,
features.plot =markers_genes,
cols.use = c("grey", "blue"),
reduction.use = "tsne")
markers_df <- FindMarkers(object = sce, ident.1 = 2, min.pct = 0.25)
print(x = head(markers_df))
## p_val avg_logFC pct.1 pct.2 p_val_adj
## SCG5 3.326696e-16 0.8585973 0.824 0.094 4.875606e-12
## CDH13 5.597805e-15 1.2579869 0.735 0.073 8.204142e-11
## TMEM108 7.817508e-15 0.7626764 0.735 0.083 1.145734e-10
## MEF2C 2.908036e-14 2.2290140 0.882 0.292 4.262018e-10
## GRIN2B 3.495319e-14 1.0857874 0.676 0.062 5.122739e-10
## MEG3 1.126700e-13 1.5753348 0.971 0.438 1.651291e-09
markers_genes = rownames(head(x = markers_df, n = 5))
VlnPlot(object = sce, features.plot =markers_genes, use.raw = TRUE, y.log = TRUE)
FeaturePlot(object = sce,
features.plot =markers_genes,
cols.use = c("grey", "blue"),
reduction.use = "tsne")
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25,
thresh.use = 0.25)
DT::datatable(sce.markers)
library(dplyr)
sce.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
## # A tibble: 6 x 7
## # Groups: cluster [3]
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 1.01e- 5 1.84 0.467 0.114 1.48e- 1 0 ERBB4
## 2 7.88e- 3 1.71 0.3 0.114 1.00e+ 0 0 PDZRN3
## 3 2.30e-22 2.43 0.806 0 3.37e-18 1 DLK1
## 4 1.94e-18 2.47 0.917 0.202 2.84e-14 1 BCAT1
## 5 2.91e-14 2.23 0.882 0.292 4.26e-10 2 MEF2C
## 6 6.21e- 7 1.86 0.588 0.229 9.10e- 3 2 LINC00643
top10 <- sce.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
# setting slim.col.label to TRUE will print just the cluster IDS instead of# every cell name
DoHeatmap(object = sce, genes.use = top10$gene, slim.col.label = TRUE, remove.key = TRUE)
FeaturePlot(object = sce,
features.plot =top10$gene,
cols.use = c("grey", "blue"),
reduction.use = "tsne")
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 dplyr_0.7.8
## [3] scRNAseq_1.8.0 SummarizedExperiment_1.12.0
## [5] DelayedArray_0.8.0 BiocParallel_1.14.2
## [7] matrixStats_0.54.0 Biobase_2.42.0
## [9] GenomicRanges_1.34.0 GenomeInfoDb_1.16.0
## [11] IRanges_2.16.0 S4Vectors_0.20.1
## [13] BiocGenerics_0.28.0 Seurat_2.3.4
## [15] Matrix_1.2-14 cowplot_0.9.3
## [17] ggplot2_3.1.0
##
## loaded via a namespace (and not attached):
## [1] snow_0.4-3 backports_1.1.3 Hmisc_4.2-0
## [4] plyr_1.8.4 igraph_1.2.2 lazyeval_0.2.1
## [7] splines_3.5.1 crosstalk_1.0.0 digest_0.6.18
## [10] foreach_1.4.4 htmltools_0.3.6 lars_1.2
## [13] gdata_2.18.0 fansi_0.4.0 magrittr_1.5
## [16] checkmate_1.9.1 cluster_2.0.7-1 mixtools_1.1.0
## [19] ROCR_1.0-7 R.utils_2.7.0 colorspace_1.4-0
## [22] xfun_0.4 crayon_1.3.4 RCurl_1.95-4.11
## [25] jsonlite_1.6 bindr_0.1.1 survival_2.42-3
## [28] zoo_1.8-4 iterators_1.0.10 ape_5.2
## [31] glue_1.3.0 gtable_0.2.0 zlibbioc_1.26.0
## [34] XVector_0.22.0 kernlab_0.9-27 prabclus_2.2-6
## [37] DEoptimR_1.0-8 scales_1.0.0 mvtnorm_1.0-8
## [40] bibtex_0.4.2 Rcpp_1.0.0 metap_1.0
## [43] dtw_1.20-1 xtable_1.8-3 htmlTable_1.13.1
## [46] reticulate_1.10 foreign_0.8-70 bit_1.1-14
## [49] proxy_0.4-22 mclust_5.4.1 SDMTools_1.1-221
## [52] Formula_1.2-3 tsne_0.1-3 DT_0.5
## [55] htmlwidgets_1.3 httr_1.3.1 gplots_3.0.1
## [58] RColorBrewer_1.1-2 fpc_2.1-11.1 acepack_1.4.1
## [61] modeltools_0.2-22 ica_1.0-2 pkgconfig_2.0.2
## [64] R.methodsS3_1.7.1 flexmix_2.3-14 nnet_7.3-12
## [67] utf8_1.1.4 tidyselect_0.2.5 labeling_0.3
## [70] rlang_0.3.1 reshape2_1.4.3 later_0.7.5
## [73] munsell_0.5.0 tools_3.5.1 cli_1.0.1
## [76] ggridges_0.5.1 evaluate_0.12 stringr_1.3.1
## [79] yaml_2.2.0 npsurv_0.4-0 knitr_1.21
## [82] bit64_0.9-7 fitdistrplus_1.0-11 robustbase_0.93-3
## [85] caTools_1.17.1.1 purrr_0.3.0 RANN_2.6
## [88] pbapply_1.3-4 nlme_3.1-137 mime_0.6
## [91] R.oo_1.22.0 hdf5r_1.0.1 compiler_3.5.1
## [94] rstudioapi_0.9.0 png_0.1-7 lsei_1.2-0
## [97] tibble_2.0.1 stringi_1.2.4 lattice_0.20-35
## [100] trimcluster_0.1-2.1 pillar_1.3.1 BiocManager_1.30.3
## [103] Rdpack_0.10-1 lmtest_0.9-36 data.table_1.12.0
## [106] bitops_1.0-6 irlba_2.3.2 gbRd_0.4-11
## [109] httpuv_1.4.5 R6_2.3.0 latticeExtra_0.6-28
## [112] promises_1.0.1 KernSmooth_2.23-15 gridExtra_2.3
## [115] codetools_0.2-15 MASS_7.3-50 gtools_3.8.1
## [118] assertthat_0.2.0 rprojroot_1.3-2 withr_2.1.2
## [121] GenomeInfoDbData_1.1.0 diptest_0.75-7 doSNOW_1.0.16
## [124] grid_3.5.1 rpart_4.1-13 tidyr_0.8.1
## [127] class_7.3-14 rmarkdown_1.10 segmented_0.5-3.0
## [130] Rtsne_0.15 shiny_1.1.0 base64enc_0.1-3